options(java.parameters = "-Xmx2048m",
stringsAsFactors = FALSE,
encoding = 'UTF-8')
suppressPackageStartupMessages({
# ISLR2
library(ISLR2)
# Schmidt
library(far)
# kernel density
library(ks)
# DM
library(zip)
library(openxlsx)
library(readxl)
library(writexl)
library(RcppRoll)
library(plyr)
library(stringi)
library(feather)
library(RODBC)
library(MASS)
library(car)
library(data.table)
library(lubridate)
library(plotly)
library(pROC)
library(tidymodels)
library(tidyverse)
})
environment variables.
color.origin <- c('1' = '#19d3f3', '2' = '#FFB6C1', '3' = '#00FF00')
Scatter-plot matrix, or splom.
auto.splom <- plot_ly(ISLR2::Auto,
color = as.integer(ISLR2::Auto$origin),
colors = color.origin) %>%
add_trace(
type = 'splom',
dimensions = list(
list(label = 'mpg', values = ~mpg),
list(label = 'cyl', values = ~cylinders),
list(label = 'displace', values = ~displacement),
list(label = 'hp', values = ~horsepower),
list(label = 'weight', values = ~weight),
list(label = 'acc', values = ~acceleration),
list(label = 'year', values = ~year),
list(label = 'origin', values = ~origin)
),
text = ~origin,
marker = list(
size = 3
)
) %>%
hide_colorbar() %>%
layout(
title = 'Scatter-plot Matrix of Auto',
hovermode = 'closest',
dragmode = 'select',
plot_bgcolor = 'rgba(240,240,240, 0.95)',
autosize = TRUE,
xaxis = list(domain = NULL, showline = F, zeroline = F, gridcolor = '#ffff', ticklen = 2),
yaxis=list(domain = NULL, showline = F, zeroline = F, gridcolor = '#ffff', ticklen = 2)
)
auto.splom
Correlation matrix.
cor(ISLR2::Auto[, 1:8])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
Multivariate linear regression.
auto.dum <- ISLR2::Auto[, 1:8] %>%
mutate(American = if_else(origin == 1, 1, 0),
European = if_else(origin == 2, 1, 0)) %>%
select(-origin)
auto.lr <- lm(mpg ~ ., data = auto.dum)
summary(auto.lr)
##
## Call:
## lm(formula = mpg ~ ., data = auto.dum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0095 -2.0785 -0.0982 1.9856 13.3608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.510e+01 4.681e+00 -3.226 0.00136 **
## cylinders -4.897e-01 3.212e-01 -1.524 0.12821
## displacement 2.398e-02 7.653e-03 3.133 0.00186 **
## horsepower -1.818e-02 1.371e-02 -1.326 0.18549
## weight -6.710e-03 6.551e-04 -10.243 < 2e-16 ***
## acceleration 7.910e-02 9.822e-02 0.805 0.42110
## year 7.770e-01 5.178e-02 15.005 < 2e-16 ***
## American -2.853e+00 5.527e-01 -5.162 3.93e-07 ***
## European -2.232e-01 5.661e-01 -0.394 0.69355
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.307 on 383 degrees of freedom
## Multiple R-squared: 0.8242, Adjusted R-squared: 0.8205
## F-statistic: 224.5 on 8 and 383 DF, p-value: < 2.2e-16
Linear regression diagnostic plot.
par(mfrow = c(2, 2))
plot(auto.lr)
Stepwise regression.
auto.lr.inter <- lm(mpg ~ 1, data = auto.dum)
auto.lr.both <- stats::step(auto.lr.inter, scope = formula(auto.lr), direction = 'both', trace = 0)
summary(auto.lr.both)
##
## Call:
## lm(formula = mpg ~ weight + year + American + displacement +
## horsepower + cylinders, data = auto.dum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0599 -2.0866 -0.1227 1.9076 13.5115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.839999 4.113203 -3.365 0.000843 ***
## weight -0.006505 0.000564 -11.534 < 2e-16 ***
## year 0.777686 0.050646 15.355 < 2e-16 ***
## American -2.751258 0.481795 -5.710 2.26e-08 ***
## displacement 0.023493 0.007598 3.092 0.002133 **
## horsepower -0.024557 0.010705 -2.294 0.022329 *
## cylinders -0.496154 0.319883 -1.551 0.121712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.301 on 385 degrees of freedom
## Multiple R-squared: 0.8238, Adjusted R-squared: 0.8211
## F-statistic: 300.1 on 6 and 385 DF, p-value: < 2.2e-16
weight, year, American.
auto.lr.both3 <- lm(mpg ~ weight + year + American, data = auto.dum)
summary(auto.lr.both3)
##
## Call:
## lm(formula = mpg ~ weight + year + American, data = auto.dum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4841 -2.1141 -0.0192 1.7795 13.6261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.639e+01 3.921e+00 -4.180 3.60e-05 ***
## weight -5.893e-03 2.593e-04 -22.731 < 2e-16 ***
## year 7.725e-01 4.823e-02 16.017 < 2e-16 ***
## American -2.095e+00 4.361e-01 -4.804 2.22e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.334 on 388 degrees of freedom
## Multiple R-squared: 0.8189, Adjusted R-squared: 0.8176
## F-statistic: 585 on 3 and 388 DF, p-value: < 2.2e-16
Successive orthogonalization.
auto.ortho <- auto.dum[, -1] %>%
mutate(x0 = 1, .before = 1) %>%
as.matrix() %>%
far::orthonormalization(basis = FALSE, norm = FALSE)
auto.lr.ortho.coef <- c()
for (i in 2:ncol(auto.ortho)) {
auto.lr.res <- lm(auto.dum$mpg ~ auto.ortho[, i])
auto.lr.ortho.coef[names(auto.dum)[i]] <- auto.lr.res$coefficients[2]
}
sort(auto.lr.ortho.coef)
## cylinders American European horsepower displacement acceleration
## -3.558078368 -2.746947282 -0.223225868 -0.059935368 -0.051118499 -0.029104714
## weight year
## -0.005277173 0.753367180
cylinders, American, year.
auto.lr.ortho3 <- lm(mpg ~ cylinders + American + year, data = auto.dum)
summary(auto.lr.ortho3)
##
## Call:
## lm(formula = mpg ~ cylinders + American + year, data = auto.dum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.0529 -2.5583 -0.2539 2.2336 14.5046
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.85015 4.79913 -4.345 1.78e-05 ***
## cylinders -2.44655 0.15958 -15.331 < 2e-16 ***
## American -3.03313 0.53189 -5.703 2.34e-08 ***
## year 0.78415 0.05908 13.274 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.017 on 388 degrees of freedom
## Multiple R-squared: 0.7371, Adjusted R-squared: 0.7351
## F-statistic: 362.6 on 3 and 388 DF, p-value: < 2.2e-16
High salary.
plot(sort(ISLR2::Hitters$Salary))
Set threshold as 500.
hitters.class <- ISLR2::Hitters %>%
filter(!is.na(Salary)) %>%
mutate(Salary = if_else(Salary > 500, 1, 0),
League = if_else(League == 'A', 1, 0),
Division = if_else(Division == 'E', 1, 0),
NewLeague = if_else(NewLeague == 'A', 1, 0))
Stepwise.
hitters.lr.inter <- lm(Salary ~ 1, data = hitters.class)
hitters.lr.all <- lm(Salary ~ ., data = hitters.class)
hitters.lr.both <- stats::step(hitters.lr.inter, scope = formula(hitters.lr.all), direction = 'both', trace = 0)
summary(hitters.lr.both)
##
## Call:
## lm(formula = Salary ~ CHits + Hits + Division + NewLeague + Walks,
## data = hitters.class)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.97970 -0.26103 -0.05952 0.31275 1.33527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.447e-01 7.008e-02 -3.491 0.000566 ***
## CHits 3.092e-04 3.834e-05 8.063 2.83e-14 ***
## Hits 3.219e-03 6.590e-04 4.885 1.82e-06 ***
## Division 1.114e-01 4.771e-02 2.334 0.020372 *
## NewLeague -1.072e-01 4.790e-02 -2.239 0.026023 *
## Walks 2.505e-03 1.375e-03 1.822 0.069630 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3853 on 257 degrees of freedom
## Multiple R-squared: 0.4067, Adjusted R-squared: 0.3951
## F-statistic: 35.23 on 5 and 257 DF, p-value: < 2.2e-16
CHits, Hits.
Linear regression.
hitters.lr <- lm(Salary ~ CHits + Hits, data = hitters.class)
summary(hitters.lr)
##
## Call:
## lm(formula = Salary ~ CHits + Hits, data = hitters.class)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.97748 -0.26241 -0.05775 0.31725 1.21810
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.249e-01 6.447e-02 -3.488 0.000571 ***
## CHits 3.233e-04 3.861e-05 8.375 3.47e-15 ***
## Hits 3.869e-03 5.546e-04 6.978 2.49e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3937 on 260 degrees of freedom
## Multiple R-squared: 0.3734, Adjusted R-squared: 0.3686
## F-statistic: 77.48 on 2 and 260 DF, p-value: < 2.2e-16
Logistic regression.
hitters.logistic <- glm(Salary ~ CHits + Hits, family = binomial(link = 'logit'), data = hitters.class)
summary(hitters.logistic)
##
## Call:
## glm(formula = Salary ~ CHits + Hits, family = binomial(link = "logit"),
## data = hitters.class)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4942 -0.6542 -0.3463 0.7403 2.9237
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.3009111 0.5536668 -7.768 7.97e-15 ***
## CHits 0.0020230 0.0003179 6.364 1.97e-10 ***
## Hits 0.0226544 0.0040287 5.623 1.87e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 358.79 on 262 degrees of freedom
## Residual deviance: 241.51 on 260 degrees of freedom
## AIC: 247.51
##
## Number of Fisher Scoring iterations: 5
Accurate rate.
hitters.pred.logistic <- hitters.class %>%
mutate(p = hitters.logistic$fitted.values,
Salary_p = if_else(p > 0.5, 1, 0),
acc = if_else(Salary == Salary_p, 1, 0))
(hitters.logistic.acc <- sum(hitters.pred.logistic$acc) / nrow(hitters.pred.logistic))
## [1] 0.8136882
hitters.pred.lr <- hitters.class %>%
mutate(pred = hitters.lr$fitted.values,
Salary_p = round(pred),
acc = if_else(Salary == Salary_p, 1, 0))
(hitters.lr.acc <- sum(hitters.pred.lr$acc) / nrow(hitters.pred.lr))
## [1] 0.8174905
Kernel density estimation.
hitters.error.select <- hitters.pred.logistic %>%
filter(acc == 0) %>%
select(CHits, Hits)
hitters.kde <- ks::kde(x = hitters.error.select, verbose = TRUE)
## ================================================================================
Plot kernel density estimation.
plot(hitters.kde, display = 'slice', cont = c(25, 50, 75, 100))